setwd("/Users/ab6415/Documents/Work_laptop_stuff/Bioinformatics/Analysis/EpiMALs_PAPER_ANALYSES_Dryad/08_TE_analysis_M1/")
library(ggplot2)
library(MASS)
library(viridis)
## Loading required package: viridisLite
library(msir)
## Package 'msir' version 1.3.2
## Type 'citation("msir")' for citing this R package in publications.
overall_sd<-function(row){
sqsum=0
ncomps=0
for (i in seq(length(row))){
for (j in seq(length(row))){
if (i>j){
sqsum=sqsum+(row[j]-row[i])**2
ncomps=ncomps+1
}}}
return(sqrt(sqsum/ncomps))
}
overall_sd_tendatapoints<-function(row){
sqsum=vector()
ncomps=vector()
for (i in seq(length(row))){
for (j in seq(length(row))){
if (i>j){
sqsum=c(sqsum,(row[j]-row[i])**2)
}}}
sqsum_sampled=sample(sqsum,size=10, replace = FALSE, prob = NULL)
sqsum<-sum(sqsum_sampled)
return(sqrt(sqsum/10))
}
intergenerational_sd<-function(row){
sqsum=0
ncomps=0
for (i in seq(length(row)-1)){
if (i != 6){
sqsum=sqsum+(row[i+1]-row[i])**2
ncomps=ncomps+1
}}
return(sqrt(sqsum/ncomps))
}
#density
get_density <- function(x, y, ...) {
dens <- MASS::kde2d(x, y, ...)
ix <- findInterval(x, dens$x)
iy <- findInterval(y, dens$y)
ii <- cbind(ix, iy)
return(dens$z[ii])
}
loess_fit_and_plotres<-function(df,x,y,thr_type,thr){
dat<-df
#dat<-dat[which(dat[,"log10_mean"]>0),]
l<-loess.sd(x = dat[,x],y = dat[,y], nsigma = 1.96)
l_fit<-data.frame(x=l$x,y=l$y,sd=l$sd,upper=l$upper,lower=l$lower,ID=dat$ID)
l_fit$Z<-(dat[,y]-l_fit$y)/l_fit$sd
l_fit$pvalue<-pnorm(l_fit$Z,lower.tail = FALSE)
l_fit$ID<-dat$ID
if (thr_type=="fdr"){
l_fit_padj<-l_fit[which(l_fit$x>0.9999999),]
l_fit_padj$padj<-p.adjust(l_fit_padj$pvalue,method = "fdr")
dat$sig<-rep(0,nrow(dat))
dat[which(dat$ID %in% l_fit_padj[which(l_fit_padj$padj<thr),"ID"]),"sig"]<-1
}
else if (thr_type=="pval"){
dat$sig<-rep(0,nrow(dat))
dat[which(dat$ID %in% l_fit[which(l_fit$pvalue<thr),"ID"]),"sig"]<-1
dat[which(dat$log10_mean<1),"sig"]<-0
}
dat$density<-get_density(dat[,x],dat[,y], n = 100)
print(ggplot(dat)+geom_point(aes(dat[,x],dat[,y],color=density))+ scale_color_viridis()+geom_line(aes(x=l_fit$x,y=l_fit$lower),color="pink",linetype="dashed")+geom_line(aes(x=l_fit$x,y=l_fit$y),color="pink")+geom_line(aes(x=l_fit$x,y=l_fit$upper),color="pink",linetype="dashed")+geom_point(data=subset(dat,sig==1),aes(dat[which(dat$sig==1),x],dat[which(dat$sig==1),y]),color="red")+geom_hline(yintercept = -0.6,linetype="dashed")+ylab(y)+xlab(x))
dat<-merge(dat,l_fit[,c("pvalue","ID","Z")],by="ID")
return(dat[which(dat$sig==1),])
}
setwd("/Users/ab6415/Documents/Work_laptop_stuff/Bioinformatics/Analysis/EpiMALs_PAPER_ANALYSES_Dryad/08_TE_analysis_M1/")
ttg_DEseqnorm<-read.table("22G_counts/final_counts_table/all_counts_M1_deseqnorm_averaged.txt")
ttg_A_DEseqnorm<-ttg_DEseqnorm[,1:12]
dat_ttgA<-data.frame(mean=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=mean),
sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=sd),
overall_sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=overall_sd),
inter_sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=intergenerational_sd),
subsampled_overall_sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=overall_sd_tendatapoints),
ID=rownames(ttg_A_DEseqnorm),
max=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN = max))
dat_ttgA<-dat_ttgA[which(dat_ttgA$mean>0),]
#calculate log10cv2 values
dat_ttgA$log10_overall_cv2<-log10((dat_ttgA$overall_sd/dat_ttgA$mean)**2)
dat_ttgA$log10_inter_cv2<-log10((dat_ttgA$inter_sd/dat_ttgA$mean)**2)
dat_ttgA$log10_overallsub_cv2<-log10((dat_ttgA$subsampled_overall_sd/dat_ttgA$mean)**2)
dat_ttgA$log10_mean<-log10(dat_ttgA$mean+1)
dat_ttgA$log10_cv2<-log10((dat_ttgA$sd/dat_ttgA$mean)**2)
dat_ttgA$ff<-(dat_ttgA$overall_sd**2)/dat_ttgA$mean
dat_ttgA<-dat_ttgA[order(dat_ttgA$log10_mean,decreasing = FALSE),]
#plot
dat_ttgA$density<-get_density(dat_ttgA$log10_mean,dat_ttgA$log10_overall_cv2, n = 100)
ggplot(dat_ttgA)+geom_point(aes(log10_mean,log10_overall_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2")

dat_ttgA_noInf<-dat_ttgA[-which(dat_ttgA$log10_overallsub_cv2== (-Inf)),]
dat_ttgA_noInf$density<-get_density(dat_ttgA_noInf$log10_mean,dat_ttgA_noInf$log10_overallsub_cv2, n = 100)
ggplot(dat_ttgA_noInf)+geom_point(aes(log10_mean,log10_overallsub_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2, subsampled")

dat_ttgA$density<-get_density(dat_ttgA$log10_mean,dat_ttgA$log10_inter_cv2, n = 100)
ggplot(dat_ttgA)+geom_point(aes(log10_mean,log10_inter_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, intergenerational cv2")
## Warning: Removed 1 rows containing missing values (geom_point).

ttgA_fdr0.2<-loess_fit_and_plotres(dat_ttgA,"log10_mean","log10_overall_cv2","fdr",0.2)

ttgA_pvalue0.01<-loess_fit_and_plotres(dat_ttgA,"log10_mean","log10_overall_cv2","pval",0.01)

ttg_B_DEseqnorm<-ttg_DEseqnorm[,13:24]
dat_ttgB<-data.frame(mean=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=mean),
sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=sd),
overall_sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=overall_sd),
inter_sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=intergenerational_sd),
subsampled_overall_sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=overall_sd_tendatapoints),
ID=rownames(ttg_B_DEseqnorm),
max=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN = max))
dat_ttgB<-dat_ttgB[which(dat_ttgB$mean>0),]
#calculate log10cv2 values
dat_ttgB$log10_overall_cv2<-log10((dat_ttgB$overall_sd/dat_ttgB$mean)**2)
dat_ttgB$log10_inter_cv2<-log10((dat_ttgB$inter_sd/dat_ttgB$mean)**2)
dat_ttgB$log10_overallsub_cv2<-log10((dat_ttgB$subsampled_overall_sd/dat_ttgB$mean)**2)
dat_ttgB$log10_mean<-log10(dat_ttgB$mean+1)
dat_ttgB$log10_cv2<-log10((dat_ttgB$sd/dat_ttgB$mean)**2)
dat_ttgB$ff<-(dat_ttgB$overall_sd**2)/dat_ttgB$mean
dat_ttgB<-dat_ttgB[order(dat_ttgB$log10_mean,decreasing = FALSE),]
#plot
dat_ttgB$density<-get_density(dat_ttgB$log10_mean,dat_ttgB$log10_overall_cv2, n = 100)
ggplot(dat_ttgB)+geom_point(aes(log10_mean,log10_overall_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2")

dat_ttgB_noInf<-dat_ttgB[-which(dat_ttgB$log10_overallsub_cv2== (-Inf)),]
dat_ttgB_noInf$density<-get_density(dat_ttgB_noInf$log10_mean,dat_ttgB_noInf$log10_overallsub_cv2, n = 100)
ggplot(dat_ttgB_noInf)+geom_point(aes(log10_mean,log10_overallsub_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2, subsampled")

dat_ttgB$density<-get_density(dat_ttgB$log10_mean,dat_ttgB$log10_inter_cv2, n = 100)
ggplot(dat_ttgB)+geom_point(aes(log10_mean,log10_inter_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, intergenerational cv2")

ttgB_fdr0.2<-loess_fit_and_plotres(dat_ttgB,"log10_mean","log10_overall_cv2","fdr",0.2)

ttgB_pvalue0.01<-loess_fit_and_plotres(dat_ttgB,"log10_mean","log10_overall_cv2","pval",0.01)

length(ttgA_fdr0.2$ID); length(ttgB_fdr0.2$ID)
## [1] 177
## [1] 116
length(intersect(ttgA_fdr0.2$ID,ttgB_fdr0.2$ID))
## [1] 39
length(ttgA_pvalue0.01$ID); length(ttgB_pvalue0.01$ID)
## [1] 278
## [1] 251
length(intersect(ttgA_pvalue0.01$ID,ttgB_pvalue0.01$ID))
## [1] 90
write.table(ttgA_fdr0.2$ID,file="HV22Gs_ttgA_fdr0.2.txt",quote = FALSE)
write.table(ttgB_fdr0.2$ID,file="HV22Gs_ttgB_fdr0.2.txt",quote = FALSE)
write.table(ttgA_pvalue0.01$ID,file="HV22Gs_ttgA_pval0.01.txt",quote = FALSE)
write.table(ttgB_pvalue0.01$ID,file="HV22Gs_ttgB_pval0.01.txt",quote = FALSE)
#get all permutations of 12 data points --> calculate cv2 using consecutive dps --> how does it compare to the actual cv2?
#permutations(n=12,r=12,v=1:12,repeats.allowed = FALSE)
#too many combinations and becomes computationally infeasible
n_perms_cv2<-function(row){
N<-100000
cv2s<-rep(0,N)
for (rep in seq(N)){
perm<-sample(row,size=12,replace=FALSE)
cv2s[rep]<-intergenerational_sd(perm)
}
obs_cv2<-intergenerational_sd(row)
p_value<-(length(which(cv2s<=obs_cv2))+1)/N
return(p_value)
}
n_perms_cv2_morevar<-function(row){
N<-100000
cv2s<-rep(0,N)
for (rep in seq(N)){
perm<-sample(row,size=12,replace=FALSE)
cv2s[rep]<-intergenerational_sd(perm)
}
obs_cv2<-intergenerational_sd(row)
p_value<-(length(which(cv2s>=obs_cv2))+1)/N
return(p_value)
}
#calculate pvals genome-wide (just ran it once with 10^5 reps and now I'm opening the saved file)
atleast_ten_ttgs_A<-dat_ttgA[which(dat_ttgA$mean>=10),"ID"]
atleast_ten_ttgs_B<-dat_ttgA[which(dat_ttgB$mean>=10),"ID"]
write.table(union(atleast_ten_ttgs_A,atleast_ten_ttgs_B),file="at_least_10_22Gs_in_atleast_onelin.txt")
#removed genes with <10 mean 22Gs to gain statistical power
#ttg_A_lessvar_p<-apply(ttg_A_DEseqnorm[which(rownames(ttg_A_DEseqnorm) %in% atleast_ten_ttgs_A),],MARGIN = 1,FUN=n_perms_cv2)
#ttg_B_lessvar_p<-apply(ttg_B_DEseqnorm[which(rownames(ttg_B_DEseqnorm) %in% atleast_ten_ttgs_B),],MARGIN = 1,FUN=n_perms_cv2)
#
# morevar_p<-apply(ttg_A_DEseqnorm[which(rownames(ttg_A_DEseqnorm) %in% atleast_ten_ttgs),],MARGIN = 1,FUN=n_perms_cv2_morevar)
# #morevar_padj<-p.adjust(morevar_p,method = "fdr")
#
#write.table(ttg_A_lessvar_p,file="ttgA_pvalues_lessvar_100000reps.txt",quote=FALSE)
#write.table(ttg_B_lessvar_p,file="ttgB_pvalues_lessvar_100000reps.txt",quote=FALSE)
#write.table(morevar_p,file="../gene_lists/pvalues_morevar_100000reps.txt",quote=FALSE)
#
# #open saved files
#
lessvar_p_A<-read.table("ttgA_pvalues_lessvar_100000reps.txt")
lessvar_df_A<-data.frame(pval=lessvar_p_A$x,padj=p.adjust(lessvar_p_A$x,method="fdr"),ID=rownames(lessvar_p_A))
lessvar_p_B<-read.table("ttgB_pvalues_lessvar_100000reps.txt")
lessvar_df_B<-data.frame(pval=lessvar_p_B$x,padj=p.adjust(lessvar_p_B$x,method="fdr"),ID=rownames(lessvar_p_B))
ggplot(lessvar_df_A)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linA_pvalhist.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_A)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linA_padjhist.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linB_pvalhist.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linB_padjhist.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(lessvar_df_A$padj<0.1))
## [1] 265
length(which(lessvar_df_B$padj<0.1))
## [1] 387
length(which(lessvar_df_A$padj<0.2))
## [1] 666
length(which(lessvar_df_B$padj<0.2))
## [1] 1021
length(intersect(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"]))
## [1] 70
length(union(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"]))
## [1] 582
#188 in lineage A, 354 in lineage B, overlap 58
length(intersect(lessvar_df_A[which(lessvar_df_A$padj<0.2),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.2),"ID"]))
## [1] 216
length(union(lessvar_df_A[which(lessvar_df_A$padj<0.2),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.2),"ID"]))
## [1] 1471
#565 in lineage A, 848 in lineage B, overlap 203
write.table(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],file="inter_overall_genes_linA_FDR0.1.txt")
write.table(lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"],file="inter_overall_genes_linB_FDR0.1.txt")
write.table(union(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"]),file="inter_overall_genes_linsAB_FDR0.1.txt")
write.table(lessvar_df_A[,"ID"],file="inter_overall_genes_linA_BACKGROUND_LIST.txt")
write.table(lessvar_df_B[,"ID"],file="inter_overall_genes_linB_BACKGROUND_LIST.txt")
removegenes<-read.table("removegenes.txt"); removegenes<-as.character(removegenes$x)
lessvar_df_A_nobatch<-lessvar_df_A[-which(lessvar_df_A$ID %in% removegenes),]
lessvar_df_B_nobatch<-lessvar_df_B[-which(lessvar_df_B$ID %in% removegenes),]
ggplot(lessvar_df_A_nobatch)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linA_pvalhist_nobatch.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_A_nobatch)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linA_padjhist_nobatch.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B_nobatch)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linB_pvalhist_nobatch.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B_nobatch)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linB_padjhist_nobatch.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(lessvar_df_A_nobatch$padj<0.1))
## [1] 167
length(which(lessvar_df_B_nobatch$padj<0.1))
## [1] 253
length(which(lessvar_df_A_nobatch$padj<0.2))
## [1] 447
length(which(lessvar_df_B_nobatch$padj<0.2))
## [1] 743
length(intersect(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"]))
## [1] 36
length(union(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"]))
## [1] 384
#114 in lineage A, 237 in lineage B, overlap 30, union 321
length(intersect(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.2),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.2),"ID"]))
## [1] 105
length(union(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.2),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.2),"ID"]))
## [1] 1085
#364 in lineage A, 606 in lineage B, overlap 97, union 873
write.table(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],file="inter_overall_genes_linA_FDR0.1_nobatch.txt")
write.table(lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"],file="inter_overall_genes_linB_FDR0.1_nobatch.txt")
write.table(union(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"]),file="inter_overall_genes_linsAB_FDR0.1_nobatch.txt")
top20<-union(lessvar_df_A[order(lessvar_df_A$padj),"ID"][1:20],lessvar_df_B[order(lessvar_df_B$padj),"ID"][1:20])
for (gene in top20){
plot(1:24,as.numeric(ttg_DEseqnorm[gene,1:24]),main=gene)
}







































